In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy as sp
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [7]:
df = pd.DataFrame(pd.read_csv("downloads/diabetes.csv"))
In [11]:
#displaying the head of dataset
df.head(6) 
Out[11]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
5 5 116 74 0 0 25.6 0.201 30 0
In [13]:
#description of dataset
df.describe() 
Out[13]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [15]:
df.columns
Out[15]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
In [17]:
df.shape
Out[17]:
(768, 9)
In [21]:
#checking for null values
df.isnull().sum()
Out[21]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [23]:
df.isnull().any()
Out[23]:
Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool
In [25]:
df.isnull().all()
Out[25]:
Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool
In [49]:
#It is better to replace zeros with nan since after that counting them would be easier and zeros need to be replaced with suitable values
df_new = df.copy(deep = True)
df_new[['Glucose','BloodPressure','SkinThickness','Insulin','BMI', 'DiabetesPedigreeFunction','Age']] = df_new[['Glucose','BloodPressure','SkinThickness','Insulin','BMI', 'DiabetesPedigreeFunction','Age']].replace(0,np.NaN)

## showing the count of Nans
print(df_new.isnull().sum())
     
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
In [35]:
df_new = df.std()
In [37]:
df_new
Out[37]:
Pregnancies                   3.369578
Glucose                      31.972618
BloodPressure                19.355807
SkinThickness                15.952218
Insulin                     115.244002
BMI                           7.884160
DiabetesPedigreeFunction      0.331329
Age                          11.760232
Outcome                       0.476951
dtype: float64
In [39]:
df_new.mean()
Out[39]:
22.927432797872843
In [41]:
df_new.isnull().any()
Out[41]:
False
In [66]:
# EDA
In [68]:
df.corr() 
Out[68]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
Pregnancies 1.000000 0.129459 0.141282 -0.081672 -0.073535 0.017683 -0.033523 0.544341 0.221898
Glucose 0.129459 1.000000 0.152590 0.057328 0.331357 0.221071 0.137337 0.263514 0.466581
BloodPressure 0.141282 0.152590 1.000000 0.207371 0.088933 0.281805 0.041265 0.239528 0.065068
SkinThickness -0.081672 0.057328 0.207371 1.000000 0.436783 0.392573 0.183928 -0.113970 0.074752
Insulin -0.073535 0.331357 0.088933 0.436783 1.000000 0.197859 0.185071 -0.042163 0.130548
BMI 0.017683 0.221071 0.281805 0.392573 0.197859 1.000000 0.140647 0.036242 0.292695
DiabetesPedigreeFunction -0.033523 0.137337 0.041265 0.183928 0.185071 0.140647 1.000000 0.033561 0.173844
Age 0.544341 0.263514 0.239528 -0.113970 -0.042163 0.036242 0.033561 1.000000 0.238356
Outcome 0.221898 0.466581 0.065068 0.074752 0.130548 0.292695 0.173844 0.238356 1.000000
In [72]:
#A **heat map** is a two-dimensional representation of information with the help of colors. Heat maps can help the user visualize simple or complex information.
#correlation
# we can see skin thickness,insulin,pregnencies and age are full independent to each other
#age and pregencies has negative correlation

sns.heatmap(df.corr(),annot = True)
Out[72]:
<Axes: >
No description has been provided for this image
In [74]:
#Histogram
df.hist(figsize = (10,10))
plt.show()
No description has been provided for this image
In [76]:
sns.set(style="ticks")
sns.pairplot(df, hue="Outcome")
Out[76]:
<seaborn.axisgrid.PairGrid at 0x175a75460>
No description has been provided for this image
In [78]:
#box plot for outlier visualization
sns.set(style="whitegrid")
df.boxplot(figsize=(15,6))
Out[78]:
<Axes: >
No description has been provided for this image
In [82]:
#box plot
sns.set(style="whitegrid")


sns.boxplot(x=df['Insulin'])
plt.show()
sns.boxplot(x=df['BloodPressure'])
plt.show()
sns.boxplot(x=df['DiabetesPedigreeFunction'])
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [84]:
#outlier remove

Q1=df.quantile(0.25)
Q3=df.quantile(0.75)
IQR=Q3-Q1

print("---Q1--- \n",Q1)
print("\n---Q3--- \n",Q3)
print("\n---IQR---\n",IQR)
---Q1--- 
 Pregnancies                  1.00000
Glucose                     99.00000
BloodPressure               62.00000
SkinThickness                0.00000
Insulin                      0.00000
BMI                         27.30000
DiabetesPedigreeFunction     0.24375
Age                         24.00000
Outcome                      0.00000
Name: 0.25, dtype: float64

---Q3--- 
 Pregnancies                   6.00000
Glucose                     140.25000
BloodPressure                80.00000
SkinThickness                32.00000
Insulin                     127.25000
BMI                          36.60000
DiabetesPedigreeFunction      0.62625
Age                          41.00000
Outcome                       1.00000
Name: 0.75, dtype: float64

---IQR---
 Pregnancies                   5.0000
Glucose                      41.2500
BloodPressure                18.0000
SkinThickness                32.0000
Insulin                     127.2500
BMI                           9.3000
DiabetesPedigreeFunction      0.3825
Age                          17.0000
Outcome                       1.0000
dtype: float64
In [86]:
#outlier remove
df_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape,df_out.shape
#more than 80 records deleted
Out[86]:
((768, 9), (639, 9))
In [88]:
#Scatter matrix after removing outlier
sns.set(style="ticks")
sns.pairplot(df_out, hue="Outcome")
plt.show()
No description has been provided for this image
In [90]:
# visualizing distribution of Y feature (predictive variable)
plt.figure(figsize=(10,7))
df_out.Outcome.value_counts().sort_index().plot.bar()
diabetic_rate = df_out.Outcome.mean()
plt.title(f"Overall diabetes diagnosis rate: {diabetic_rate:.2%}", size=17)
plt.xlabel('Is diabetic?', size=17)
plt.ylabel('Count of Patients', size=17)
Out[90]:
Text(0, 0.5, 'Count of Patients')
No description has been provided for this image
In [92]:
df.shape
Out[92]:
(768, 9)
In [94]:
df.head()
Out[94]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [96]:
#lets extract features and targets
X=df_out.drop(columns=['Outcome'])
y=df_out['Outcome']
In [98]:
#Splitting train test data 80 20 ratio
from sklearn.model_selection import train_test_split
In [100]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)
In [102]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape
Out[102]:
((511, 8), (128, 8), (511,), (128,))
In [104]:
from sklearn.metrics import confusion_matrix,accuracy_score,make_scorer
from sklearn.model_selection import cross_validate

def tn(y_test, y_pred): return confusion_matrix(y_test, y_train)[0, 0]
def fp(y_test, y_pred): return confusion_matrix(y_test, y_pred)[0, 1]
def fn(y_test, y_pred): return confusion_matrix(y_test, y_pred)[1, 0]
def tp(y_test, y_pred): return confusion_matrix(y_test, y_pred)[1, 1]
     
In [106]:
#cross validation purpose
scoring = {'accuracy': make_scorer(accuracy_score),'prec': 'precision'}
scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),
           'fp': make_scorer(fp), 'fn': make_scorer(fn)}

def display_result(result):
    print("TP: ",result['test_tp'])
    print("TN: ",result['test_tn'])
    print("FN: ",result['test_fn'])
    print("FP: ",result['test_fp'])
In [108]:
#Perform 2 sample z-test
from statsmodels.stats.weightstats import ztest as ztest
ztest(X_train['BMI'],X_train['Age'],value=0)
Out[108]:
(-1.0803107438930162, 0.2800038269669761)
In [112]:
#Lets build the model
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import metrics


from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Fit the model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

# Predict
y_predict = model.predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_predict)

# Plot confusion matrix using ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Show the score
model_score = model.score(X_test, y_test)
print(f"Model Score: {model_score}")
Model Score: 0.765625
No description has been provided for this image
In [128]:
accuracy = metrics.accuracy_score(y_test, y_predict)
print("Accuracy: %.3f" % accuracy)

precision = metrics.precision_score(y_test, y_predict)
print("Precision: %.3f" % precision)

recall = metrics.recall_score(y_test, y_predict)
print("Recall: %.3f" % recall)

f1 = metrics.f1_score(y_test, y_predict)
print("F1 Score: %.3f" % f1)
Accuracy: 0.766
Precision: 0.643
Recall: 0.474
F1 Score: 0.545
In [154]:
pip install xgboost
Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (from xgboost) (1.26.4)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.12/site-packages (from xgboost) (1.13.1)
Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 595.6 kB/s eta 0:00:0000:0100:01
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.
In [156]:
conda install -c conda-forge xgboost
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _py-xgboost-mutex-2.0      |            cpu_0          12 KB  conda-forge
    ca-certificates-2024.8.30  |       hf0a4a13_0         155 KB  conda-forge
    certifi-2024.8.30          |     pyhd8ed1ab_0         160 KB  conda-forge
    conda-24.7.1               |  py312h81bd7bf_0         1.2 MB  conda-forge
    libcxx-18.1.8              |       h3ed4263_7         427 KB  conda-forge
    libexpat-2.6.2             |       hebf3989_0          62 KB  conda-forge
    libsqlite-3.46.0           |       hfb93653_0         811 KB  conda-forge
    libxgboost-2.1.1           |   cpu_h27903ac_2         1.3 MB  conda-forge
    libzlib-1.2.13             |       hfb2fe0b_6          46 KB  conda-forge
    llvm-openmp-18.1.8         |       hde57baf_1         270 KB  conda-forge
    openssl-3.3.2              |       h8359307_0         2.7 MB  conda-forge
    py-xgboost-2.1.1           | cpu_pyhb442362_2         132 KB  conda-forge
    python-3.12.2              |hdf0ec26_0_cpython        12.5 MB  conda-forge
    python_abi-3.12            |          5_cp312           6 KB  conda-forge
    xgboost-2.1.1              | cpu_pyhb8f9a19_2          15 KB  conda-forge
    zlib-1.2.13                |       hfb2fe0b_6          76 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        19.8 MB

The following NEW packages will be INSTALLED:

  _py-xgboost-mutex  conda-forge/osx-arm64::_py-xgboost-mutex-2.0-cpu_0 
  libexpat           conda-forge/osx-arm64::libexpat-2.6.2-hebf3989_0 
  libsqlite          conda-forge/osx-arm64::libsqlite-3.46.0-hfb93653_0 
  libxgboost         conda-forge/osx-arm64::libxgboost-2.1.1-cpu_h27903ac_2 
  libzlib            conda-forge/osx-arm64::libzlib-1.2.13-hfb2fe0b_6 
  py-xgboost         conda-forge/noarch::py-xgboost-2.1.1-cpu_pyhb442362_2 
  python_abi         conda-forge/osx-arm64::python_abi-3.12-5_cp312 
  xgboost            conda-forge/noarch::xgboost-2.1.1-cpu_pyhb8f9a19_2 

The following packages will be UPDATED:

  ca-certificates    pkgs/main::ca-certificates-2024.7.2-h~ --> conda-forge::ca-certificates-2024.8.30-hf0a4a13_0 
  libcxx                pkgs/main::libcxx-14.0.6-h848a8c0_0 --> conda-forge::libcxx-18.1.8-h3ed4263_7 
  llvm-openmp        pkgs/main::llvm-openmp-14.0.6-hc6e570~ --> conda-forge::llvm-openmp-18.1.8-hde57baf_1 
  openssl              pkgs/main::openssl-3.0.15-h80987f9_0 --> conda-forge::openssl-3.3.2-h8359307_0 
  zlib                    pkgs/main::zlib-1.2.13-h18a0788_1 --> conda-forge::zlib-1.2.13-hfb2fe0b_6 

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi            pkgs/main/osx-arm64::certifi-2024.8.3~ --> conda-forge/noarch::certifi-2024.8.30-pyhd8ed1ab_0 
  conda              pkgs/main::conda-24.7.1-py312hca03da5~ --> conda-forge::conda-24.7.1-py312h81bd7bf_0 
  python                pkgs/main::python-3.12.4-h99e199e_1 --> conda-forge::python-3.12.2-hdf0ec26_0_cpython 



Downloading and Extracting Packages:
python-3.12.2        | 12.5 MB   |                                       |   0% 
openssl-3.3.2        | 2.7 MB    |                                       |   0% 

libxgboost-2.1.1     | 1.3 MB    |                                       |   0% 


conda-24.7.1         | 1.2 MB    |                                       |   0% 



libsqlite-3.46.0     | 811 KB    |                                       |   0% 




libcxx-18.1.8        | 427 KB    |                                       |   0% 





llvm-openmp-18.1.8   | 270 KB    |                                       |   0% 






certifi-2024.8.30    | 160 KB    |                                       |   0% 







ca-certificates-2024 | 155 KB    |                                       |   0% 








py-xgboost-2.1.1     | 132 KB    |                                       |   0% 









zlib-1.2.13          | 76 KB     |                                       |   0% 










libexpat-2.6.2       | 62 KB     |                                       |   0% 











libzlib-1.2.13       | 46 KB     |                                       |   0% 












xgboost-2.1.1        | 15 KB     |                                       |   0% 













_py-xgboost-mutex-2. | 12 KB     |                                       |   0% 














python-3.12.2        | 12.5 MB   |                                       |   0% 



libsqlite-3.46.0     | 811 KB    | 7                                     |   2% 
openssl-3.3.2        | 2.7 MB    | 2                                     |   1% 

python-3.12.2        | 12.5 MB   | 3                                     |   1% 
openssl-3.3.2        | 2.7 MB    | 6                                     |   2% 



libsqlite-3.46.0     | 811 KB    | #####1                                |  14% 

python-3.12.2        | 12.5 MB   | 7                                     |   2% 



libsqlite-3.46.0     | 811 KB    | ###############3                      |  41% 

python-3.12.2        | 12.5 MB   | #3                                    |   4% 

libxgboost-2.1.1     | 1.3 MB    | ######################4               |  61% 



python-3.12.2        | 12.5 MB   | #7                                    |   5% 
openssl-3.3.2        | 2.7 MB    | #4                                    |   4% 

libxgboost-2.1.1     | 1.3 MB    | #############################9        |  81% 



libsqlite-3.46.0     | 811 KB    | ###########################           |  73% 
openssl-3.3.2        | 2.7 MB    | #6                                    |   5% 

libxgboost-2.1.1     | 1.3 MB    | ##################################### | 100% 

libxgboost-2.1.1     | 1.3 MB    | ##################################### | 100% 
python-3.12.2        | 12.5 MB   | ##                                    |   6% 




libcxx-18.1.8        | 427 KB    | #3                                    |   4% 



libsqlite-3.46.0     | 811 KB    | ##############################6       |  83% 
python-3.12.2        | 12.5 MB   | ##3                                   |   6% 




libcxx-18.1.8        | 427 KB    | #####5                                |  15% 



python-3.12.2        | 12.5 MB   | ###                                   |   8% 
openssl-3.3.2        | 2.7 MB    | ###9                                  |  11% 




libcxx-18.1.8        | 427 KB    | ################6                     |  45% 
python-3.12.2        | 12.5 MB   | ###2                                  |   9% 





llvm-openmp-18.1.8   | 270 KB    | ##1                                   |   6% 


conda-24.7.1         | 1.2 MB    | 4                                     |   1% 




libcxx-18.1.8        | 427 KB    | ##################################### | 100% 




libcxx-18.1.8        | 427 KB    | ##################################### | 100% 






certifi-2024.8.30    | 160 KB    | ###7                                  |  10% 
openssl-3.3.2        | 2.7 MB    | ######3                               |  17% 






certifi-2024.8.30    | 160 KB    | ##################################### | 100% 


python-3.12.2        | 12.5 MB   | ####1                                 |  11% 





llvm-openmp-18.1.8   | 270 KB    | ###############3                      |  42% 





llvm-openmp-18.1.8   | 270 KB    | ########################1             |  65% 


conda-24.7.1         | 1.2 MB    | #####4                                |  15% 
python-3.12.2        | 12.5 MB   | ####5                                 |  12% 





llvm-openmp-18.1.8   | 270 KB    | ################################9     |  89% 


conda-24.7.1         | 1.2 MB    | #################3                    |  47% 





python-3.12.2        | 12.5 MB   | #####                                 |  14% 
openssl-3.3.2        | 2.7 MB    | #########                             |  24% 


python-3.12.2        | 12.5 MB   | #####6                                |  15% 








py-xgboost-2.1.1     | 132 KB    | ####4                                 |  12% 
openssl-3.3.2        | 2.7 MB    | ##########3                           |  28% 








python-3.12.2        | 12.5 MB   | ######1                               |  17% 







ca-certificates-2024 | 155 KB    | ###8                                  |  10% 
openssl-3.3.2        | 2.7 MB    | ###########1                          |  30% 







ca-certificates-2024 | 155 KB    | ###################1                  |  52% 







ca-certificates-2024 | 155 KB    | ##################################### | 100% 








py-xgboost-2.1.1     | 132 KB    | ##################################### | 100% 
python-3.12.2        | 12.5 MB   | ######5                               |  18% 










libexpat-2.6.2       | 62 KB     | #########5                            |  26% 









zlib-1.2.13          | 76 KB     | #######7                              |  21% 










libexpat-2.6.2       | 62 KB     | ##################################### | 100% 
python-3.12.2        | 12.5 MB   | #######                               |  19% 









zlib-1.2.13          | 76 KB     | ##################################### | 100% 









zlib-1.2.13          | 76 KB     | ##################################### | 100% 











libzlib-1.2.13       | 46 KB     | ############9                         |  35% 











libzlib-1.2.13       | 46 KB     | ##################################### | 100% 
python-3.12.2        | 12.5 MB   | #######5                              |  21% 













_py-xgboost-mutex-2. | 12 KB     | ##################################### | 100% 













_py-xgboost-mutex-2. | 12 KB     | ##################################### | 100% 












xgboost-2.1.1        | 15 KB     | ##################################### | 100% 












xgboost-2.1.1        | 15 KB     | ##################################### | 100% 
python-3.12.2        | 12.5 MB   | ########                              |  22% 














python_abi-3.12      | 6 KB      | ##################################### | 100% 














python_abi-3.12      | 6 KB      | ##################################### | 100% 
python-3.12.2        | 12.5 MB   | ########6                             |  23% 


python-3.12.2        | 12.5 MB   | #########3                            |  25% 
openssl-3.3.2        | 2.7 MB    | ##################9                   |  51% 
python-3.12.2        | 12.5 MB   | #########9                            |  27% 


python-3.12.2        | 12.5 MB   | ##########5                           |  29% 
python-3.12.2        | 12.5 MB   | ###########                           |  30% 
python-3.12.2        | 12.5 MB   | ###########7                          |  32% 
python-3.12.2        | 12.5 MB   | ############3                         |  33% 
python-3.12.2        | 12.5 MB   | ############7                         |  35% 
python-3.12.2        | 12.5 MB   | #############5                        |  37% 
python-3.12.2        | 12.5 MB   | ##############3                       |  39% 
openssl-3.3.2        | 2.7 MB    | ##############################4       |  82% 
python-3.12.2        | 12.5 MB   | ###############5                      |  42% 
python-3.12.2        | 12.5 MB   | ################                      |  43% 
python-3.12.2        | 12.5 MB   | #################2                    |  47% 
openssl-3.3.2        | 2.7 MB    | ##################################### | 100% 
                                                                                
                                                                                

                                                                                


                                                                                



                                                                                




                                                                                





                                                                                






                                                                                







                                                                                








                                                                                









                                                                                










                                                                                











                                                                                












                                                                                













                                                                                














                                                                                
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.
In [158]:
#XGBoost
from xgboost import XGBClassifier

xgb_model =XGBClassifier(gamma=0)
xgb_model.fit(X_train, y_train)
Out[158]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In [161]:
xgb_pred = xgb_model.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test, xgb_pred)))
#XGBoost seems to be doing well with an accuracy score of 0.7795.
Accuracy Score = 0.7265625
In [163]:
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))
[[73 17]
 [18 20]]
              precision    recall  f1-score   support

           0       0.80      0.81      0.81        90
           1       0.54      0.53      0.53        38

    accuracy                           0.73       128
   macro avg       0.67      0.67      0.67       128
weighted avg       0.72      0.73      0.73       128

In [165]:
xgb_model.feature_importances_
Out[165]:
array([0.10681806, 0.26690283, 0.07197934, 0.09486822, 0.08104164,
       0.15018146, 0.10262704, 0.12558141], dtype=float32)
In [167]:
(pd.Series(xgb_model.feature_importances_).plot(kind='barh'))
Out[167]:
<Axes: >
No description has been provided for this image
In [169]:
#Predction
#printing the prediction probabities for the the test data
print('Prediction Probabilities')
xgb_model.predict_proba(X_test)
     
Prediction Probabilities
Out[169]:
array([[9.92698014e-01, 7.30198948e-03],
       [9.98287678e-01, 1.71232747e-03],
       [9.98678267e-01, 1.32173719e-03],
       [9.62741137e-01, 3.72588895e-02],
       [9.95805860e-01, 4.19415860e-03],
       [9.99800384e-01, 1.99629183e-04],
       [1.69157982e-03, 9.98308420e-01],
       [9.95779276e-01, 4.22069523e-03],
       [6.82082772e-01, 3.17917198e-01],
       [9.93828118e-01, 6.17191056e-03],
       [1.81093395e-01, 8.18906605e-01],
       [6.96693301e-01, 3.03306669e-01],
       [9.96942639e-01, 3.05735902e-03],
       [2.92949259e-01, 7.07050741e-01],
       [9.61683869e-01, 3.83161381e-02],
       [1.66436434e-01, 8.33563566e-01],
       [5.87375700e-01, 4.12624300e-01],
       [4.59243536e-01, 5.40756464e-01],
       [9.22846317e-01, 7.71537125e-02],
       [9.92303431e-01, 7.69655732e-03],
       [4.71982360e-02, 9.52801764e-01],
       [9.95942473e-01, 4.05751402e-03],
       [8.86238933e-01, 1.13761090e-01],
       [4.71579313e-01, 5.28420687e-01],
       [8.06907177e-01, 1.93092853e-01],
       [2.81797826e-01, 7.18202174e-01],
       [7.76815891e-01, 2.23184139e-01],
       [8.83256972e-01, 1.16743043e-01],
       [9.99338388e-01, 6.61599974e-04],
       [9.88308847e-01, 1.16911745e-02],
       [9.88125741e-01, 1.18742343e-02],
       [9.96705711e-01, 3.29429726e-03],
       [9.98044312e-01, 1.95569103e-03],
       [1.18758321e-01, 8.81241679e-01],
       [3.25666785e-01, 6.74333215e-01],
       [9.89137471e-01, 1.08625386e-02],
       [9.85866129e-01, 1.41338641e-02],
       [9.99476731e-01, 5.23241761e-04],
       [9.99013960e-01, 9.86026134e-04],
       [9.73583639e-01, 2.64163371e-02],
       [5.72578907e-02, 9.42742109e-01],
       [9.97030139e-01, 2.96986313e-03],
       [1.24645114e-01, 8.75354886e-01],
       [8.22680950e-01, 1.77319065e-01],
       [9.99679565e-01, 3.20413412e-04],
       [9.56447899e-01, 4.35521156e-02],
       [1.15604341e-01, 8.84395659e-01],
       [4.43898380e-01, 5.56101620e-01],
       [9.64564025e-01, 3.54360007e-02],
       [7.71950305e-01, 2.28049681e-01],
       [4.06223476e-01, 5.93776524e-01],
       [9.97017503e-01, 2.98252259e-03],
       [9.62221980e-01, 3.77780497e-02],
       [9.87115860e-01, 1.28841205e-02],
       [9.92482126e-01, 7.51787145e-03],
       [9.71470773e-01, 2.85292454e-02],
       [2.41522193e-02, 9.75847781e-01],
       [5.61645985e-01, 4.38354015e-01],
       [6.84203386e-01, 3.15796584e-01],
       [9.92389381e-01, 7.61063257e-03],
       [9.80956137e-01, 1.90438889e-02],
       [6.91584468e-01, 3.08415532e-01],
       [9.93445873e-01, 6.55414443e-03],
       [8.46824646e-02, 9.15317535e-01],
       [6.86359048e-01, 3.13640922e-01],
       [9.89106834e-01, 1.08931940e-02],
       [1.31346583e-02, 9.86865342e-01],
       [9.97377694e-01, 2.62228516e-03],
       [9.07706857e-01, 9.22931656e-02],
       [9.84076560e-01, 1.59234330e-02],
       [9.06054616e-01, 9.39453840e-02],
       [6.35308027e-02, 9.36469197e-01],
       [3.59359920e-01, 6.40640080e-01],
       [9.84128118e-01, 1.58719067e-02],
       [9.76188898e-01, 2.38111205e-02],
       [1.32267296e-01, 8.67732704e-01],
       [9.99916494e-01, 8.34780512e-05],
       [9.99698997e-01, 3.00994056e-04],
       [9.36300278e-01, 6.36996925e-02],
       [8.14855099e-04, 9.99185145e-01],
       [1.40027404e-01, 8.59972596e-01],
       [1.83433175e-01, 8.16566825e-01],
       [9.92176473e-01, 7.82352127e-03],
       [9.22738433e-01, 7.72615969e-02],
       [7.93317556e-02, 9.20668244e-01],
       [7.03954339e-01, 2.96045661e-01],
       [9.94522274e-01, 5.47774415e-03],
       [4.80942488e-01, 5.19057512e-01],
       [8.43731403e-01, 1.56268626e-01],
       [1.10597014e-02, 9.88940299e-01],
       [1.41111612e-01, 8.58888388e-01],
       [9.18520510e-01, 8.14795047e-02],
       [9.79698181e-01, 2.03017965e-02],
       [9.40711617e-01, 5.92883751e-02],
       [4.03893709e-01, 5.96106291e-01],
       [6.05168939e-01, 3.94831091e-01],
       [8.15820932e-01, 1.84179068e-01],
       [6.85274839e-01, 3.14725190e-01],
       [6.28185332e-01, 3.71814668e-01],
       [3.16698074e-01, 6.83301926e-01],
       [9.44051385e-01, 5.59486113e-02],
       [9.98127282e-01, 1.87270797e-03],
       [9.77459550e-01, 2.25404389e-02],
       [9.84630167e-01, 1.53698223e-02],
       [8.09853554e-01, 1.90146461e-01],
       [7.96324253e-01, 2.03675762e-01],
       [2.09695697e-02, 9.79030430e-01],
       [9.98806477e-01, 1.19350385e-03],
       [9.37582970e-01, 6.24170527e-02],
       [2.94814408e-01, 7.05185592e-01],
       [9.35056090e-01, 6.49439394e-02],
       [5.82101226e-01, 4.17898804e-01],
       [8.78445029e-01, 1.21555001e-01],
       [9.97622252e-01, 2.37775035e-03],
       [9.32184637e-01, 6.78153485e-02],
       [7.75922000e-01, 2.24078014e-01],
       [9.89523232e-01, 1.04767652e-02],
       [1.01554394e-02, 9.89844561e-01],
       [2.14435935e-01, 7.85564065e-01],
       [9.80520129e-01, 1.94799006e-02],
       [1.81804895e-02, 9.81819510e-01],
       [6.28389716e-02, 9.37161028e-01],
       [9.95727003e-01, 4.27297084e-03],
       [9.26763833e-01, 7.32361525e-02],
       [9.17421699e-01, 8.25782716e-02],
       [5.73029280e-01, 4.26970750e-01],
       [9.91200626e-01, 8.79937690e-03],
       [2.17268288e-01, 7.82731712e-01]], dtype=float32)
In [ ]: